import pandas as pd
from univariateAnalysis import UniVariateAnalysis, UniVariateReport, OutlierFilter
from scipy.stats import zscore
import seaborn as sns
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
import matplotlib.pylab as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from scipy.spatial.distance import pdist
from sklearn.metrics import silhouette_samples, silhouette_score
# Load the raw credit-card customer dataset from the workbook in the
# working directory (requires an Excel engine such as openpyxl).
df = pd.read_excel("Credit Card Customer Data.xlsx")
# Every column of the sheet, in order; consumed by the univariate
# reports and the outlier filter below.
all_cols = ['Sl_No','Customer Key','Avg_Credit_Limit','Total_Credit_Cards','Total_visits_bank','Total_visits_online','Total_calls_made']
def print_all_uni_analysis_reports(df, columnNames):
    """Print a univariate analysis report for each named column of *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are analysed.
    columnNames : iterable of str
        Column names to report on; each must exist in *df*.
    """
    # Fixed-width rule printed before and after each report header.
    separator = '---------------------------------------------'
    for column in columnNames:
        analysis = UniVariateAnalysis(df, column)
        analysis_report = UniVariateReport(analysis)
        print(separator)
        # Same output as the original escaped-quote f-string.
        print(f"'{column}' column univariate analysis report")
        print(separator)
        analysis_report.print_report()
# Quick sanity checks on the raw frame. NOTE(review): these bare
# expressions only render output in a notebook/REPL; as a plain script
# their results are discarded.
df.head()
df.info()
df.describe()
# check to see if there are any NaN
df.isnull().values.any()
As for scaling — we can see that the non-'total' columns are scaled quite differently from the rest. Whereas the 'total' columns range from roughly 0–20, the other columns have a significantly higher scale to them. We'll keep this in mind so these differences in scale don't affect our clustering in a negative way.
I'm not entirely sure what the 'Sl_No' is supposed to represent here, but I am assuming that it is a 'serial number' column. The 'Sl_No' along with the customer key column doesn't seem to represent anything tangible that we would want to cluster off of; these values are mostly system generated (based on the CIS system the bank uses). In a real-world scenario, we would lean on business analysts and product people to gain a better understanding of what these columns mean and what information, if any, could be derived from them that would tell us more about the customer. For now, I am going to remove these columns when grabbing the scaled dataframes.
# Print a univariate report for every column, then drop the rows the
# project's OutlierFilter flags as outliers.
print_all_uni_analysis_reports(df, all_cols)
outlier_filter = OutlierFilter(df, all_cols)
# presumably IQR/z-score based filtering — see univariateAnalysis module
df_no_outliers = outlier_filter.get_df_without_outliers()
# Re-inspect the filtered frame (notebook display expressions).
df_no_outliers.head()
df_no_outliers.describe()
df_no_outliers.info()
# Get scaled dataframes with and without outliers
# Drop the first two columns ('Sl_No', 'Customer Key'): system-generated
# identifiers with no customer-behaviour signal.
df_customer_numbers_removed = df.iloc[:,2:]
df_customer_numbers_removed.head()
df_customer_numbers_removed_no_outliers = df_no_outliers.iloc[:,2:]
df_customer_numbers_removed_no_outliers.head()
# Standardize each column (zero mean, unit variance) so the much larger
# Avg_Credit_Limit scale does not dominate distance-based clustering.
df_scaled = df_customer_numbers_removed.apply(zscore)
df_scaled.head()
df_scaled_no_outliers = df_customer_numbers_removed_no_outliers.apply(zscore)
df_scaled_no_outliers.head()
Using a pair plot, we'll take a look at each diagonal graph in order to try to grasp how many clusters we may need
# Pair plots (KDE on the diagonals) to eyeball per-feature modality and
# guess a plausible cluster count, with and without outliers.
sns.pairplot(df_scaled, height=2, aspect=2, diag_kind='kde')
sns.pairplot(df_scaled_no_outliers, height=2, aspect=2, diag_kind='kde')
From the previous analysis, let's check for the optimal number of clusters from 1–5; since 4 is the largest cluster count we determined previously, we won't go much higher than that.
# Candidate cluster counts for the elbow method.
number_of_clusters = [1, 2, 3, 4, 5]

def get_distortions(scaled_df, try_number_of_clusters):
    """Return the mean KMeans distortion for each candidate cluster count.

    Distortion here is the average Euclidean distance from each sample to
    its nearest cluster centre — the quantity plotted for the elbow method.

    Parameters
    ----------
    scaled_df : pandas.DataFrame
        Standardized feature matrix to cluster.
    try_number_of_clusters : iterable of int
        Candidate values of k to fit.

    Returns
    -------
    list of float
        Mean distortion for each k, in iteration order.
    """
    distortions = []
    n_samples = scaled_df.shape[0]  # hoisted loop invariant
    for try_num in try_number_of_clusters:
        # NOTE: no random_state, so results vary slightly between runs
        # (matches the original behaviour).
        model = KMeans(n_clusters=try_num)
        model.fit(scaled_df)
        # Original also called model.predict() and discarded the result —
        # a wasted full pass over the data; removed.
        nearest = np.min(cdist(scaled_df, model.cluster_centers_, 'euclidean'), axis=1)
        distortions.append(nearest.sum() / n_samples)
    return distortions
# Elbow plot on the full (outliers included) scaled data.
mean_distortions = get_distortions(df_scaled, number_of_clusters)
plt.plot(number_of_clusters, mean_distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
# Elbow plot on the outlier-filtered data. NOTE(review): there is no
# plt.figure()/plt.show() between the two plots — run as a plain script
# both curves land on the same axes; in a notebook each cell renders
# its own figure.
mean_distortions = get_distortions(df_scaled_no_outliers, number_of_clusters)
plt.plot(number_of_clusters, mean_distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
group_col = 'Group'
# k=3 chosen from the elbow plots above. NOTE: no random_state is set,
# so cluster numbering (and marginally the fit) can vary between runs.
k_means_model = KMeans(3)
# BUG FIX: the original called fit()/predict() and then fit_predict(),
# which refit the model with a fresh random initialisation — the 'Group'
# column and the silhouette score could therefore come from two DIFFERENT
# clusterings. Fit exactly once and reuse the labels everywhere.
cluster_labels = k_means_model.fit_predict(df_scaled_no_outliers)
df_scaled_no_outliers_with_group = df_scaled_no_outliers.copy()
df_scaled_no_outliers_with_group[group_col] = cluster_labels
df_scaled_no_outliers_with_group.head(20)
# Silhouette score in [-1, 1]; higher means tighter, better-separated clusters.
silhouette_avg = silhouette_score(df_scaled_no_outliers, cluster_labels)
print("The average silhouette_score for kmeans(3) :", silhouette_avg)
# Per-cluster feature means to characterise each group.
df_scaled_no_outliers_cluster = df_scaled_no_outliers_with_group.groupby([group_col])
df_scaled_no_outliers_cluster.mean()
df_scaled_no_outliers_with_group.boxplot(by=group_col, layout=(2,4), figsize=(15,10))
When looking at the total visits at the bank, visits online, and calls made, there seems to be an inverse relationship between the three columns. That is, Group1 has the highest value of visits at the bank, but its visits online and total calls made are low; whereas Group2 has high visits online and calls made but a low average of total visits at the bank. Group2 looks a lot like Group0.
Keeping the above statement in mind, the difference between Group0 and Group2 is the average credit limit, while the total number of credit cards seems to be close. So there may be some type of relationship where, if the customer has a higher number of credit cards, they are more likely to visit the bank in person versus calling or visiting the bank's website.
We can also see the data points cluster around certain average credit card limits that looks to be somewhat dependent on total credit cards, though as the total number of credit card increases, there is not a large increase in average credit limit, but the trend does seem to be there.
# Fit three agglomerative clusterings (average / complete / ward linkage).
# FIX: the original called .fit() and then .fit_predict() on each model —
# fit_predict() refits, so every model was trained twice. One
# fit_predict() per model suffices (it also populates .labels_).
# NOTE(review): 'affinity' was renamed 'metric' in scikit-learn 1.2 and
# removed in 1.4 — update the keyword if the environment's sklearn is newer.
avg_clustering = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='average')
avg_clustering_labels = avg_clustering.fit_predict(df_scaled_no_outliers)
complete_clustering = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='complete')
complete_clustering_labels = complete_clustering.fit_predict(df_scaled_no_outliers)
ward_clustering = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage="ward")
ward_clustering_labels = ward_clustering.fit_predict(df_scaled_no_outliers)
labels = 'labels'
# One labelled copy of the scaled frame per linkage method, for the
# per-cluster summaries and boxplots below.
df_scaled_no_outliers_with_avg_cluster_labels = df_scaled_no_outliers.copy()
df_scaled_no_outliers_with_avg_cluster_labels[labels] = avg_clustering.labels_
df_scaled_no_outliers_with_complete_cluster_labels = df_scaled_no_outliers.copy()
df_scaled_no_outliers_with_complete_cluster_labels[labels] = complete_clustering.labels_
df_scaled_no_outliers_with_ward_cluster_labels = df_scaled_no_outliers.copy()
df_scaled_no_outliers_with_ward_cluster_labels[labels] = ward_clustering.labels_
# Evaluate the average-linkage solution: silhouette score, per-cluster
# means, cophenetic correlation, and a dendrogram of the merge tree.
avg_sil_score = silhouette_score(df_scaled_no_outliers, avg_clustering_labels)
print("The average silhouette_score for hierarchical (average) :", avg_sil_score)
df_scaled_no_outliers_with_avg_cluster_labels.groupby([labels]).mean()
avg_Z = linkage(df_scaled_no_outliers, method='average', metric='euclidean')
avg_coph_corr, avg_coph_distances = cophenet(avg_Z, pdist(df_scaled_no_outliers))
# Cophenetic correlation — how faithfully the hierarchy preserves the
# original pairwise distances (notebook display expression).
avg_coph_corr
plt.figure(figsize=(10, 5))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
dendrogram(avg_Z, color_threshold=40, leaf_rotation=90.0, leaf_font_size=8.0)
plt.tight_layout()
df_scaled_no_outliers_with_avg_cluster_labels.boxplot(by=labels, figsize=(15, 10), layout=(2, 4))
# Evaluate the complete-linkage solution the same way: silhouette score,
# per-cluster means, cophenetic correlation, and a dendrogram.
complete_sil_score = silhouette_score(df_scaled_no_outliers, complete_clustering_labels)
print("The average silhouette_score for hierarchical (complete) :", complete_sil_score)
df_scaled_no_outliers_with_complete_cluster_labels.groupby([labels]).mean()
complete_Z = linkage(df_scaled_no_outliers, method='complete', metric='euclidean')
complete_coph_corr, complete_coph_distances = cophenet(complete_Z, pdist(df_scaled_no_outliers))
# Cophenetic correlation for the complete-linkage hierarchy (display).
complete_coph_corr
plt.figure(figsize=(10, 5))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
dendrogram(complete_Z, color_threshold=90, leaf_rotation=90.0, leaf_font_size=10.0)
plt.tight_layout()
df_scaled_no_outliers_with_complete_cluster_labels.boxplot(by=labels, figsize=(15, 10), layout=(2, 4))
# BUG FIX: the original computed the "ward" silhouette score from
# complete_clustering_labels (copy-paste error), so the printed number
# described the wrong clustering. Use the ward labels.
ward_sil_score = silhouette_score(df_scaled_no_outliers, ward_clustering_labels)
print("The average silhouette_score for hierarchical (ward) :", ward_sil_score)
df_scaled_no_outliers_with_ward_cluster_labels.groupby([labels]).mean()
Z = linkage(df_scaled_no_outliers, metric='euclidean', method='ward')
c, coph_dists = cophenet(Z, pdist(df_scaled_no_outliers))
# Cophenetic correlation for the ward hierarchy (notebook display).
c
plt.figure(figsize=(10, 5))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
dendrogram(Z, leaf_rotation=90., color_threshold=600, leaf_font_size=10.)
plt.tight_layout()
df_scaled_no_outliers_with_ward_cluster_labels.boxplot(by=labels, layout=(2,4), figsize=(15,10))
KMeans and Hierarchical (ward) look similar, all of the 'total' boxplots of their clusters look almost identical. The only thing that differs is the Avg credit limit
Hierarchical (complete) and Hierarchical (average) look similar to each other; the only difference looks to be the spread of the data points from the centroid.
The Hierarchical (average) and Hierarchical (complete), according to their boxplots, have smaller 'boxes' which means that the range of the distance from each point to the centroid is much smaller
Like stated above, KMeans and Hierarchical (ward) are similar, they differ from the other two Hierarchical methods in that the range of the distance from each point to the centroid is bigger than the Hierarchical (average) and the Hierarchical (complete)
Additionally, KMeans and Hierarchical (ward) clusters seems to be distributed more evenly across all columns. We can see according to the box plot that the boxes are closer in size to the other boxes in each column. Meaning that the range (or we can think of it as a 'spread') of the distances from each point to the centroid are more congruent to the other groups in that column. Whereas, the Hierarchical (average) and Hierarchical (complete) have some very large boxes (spread) in some of the columns while the other groups of that column are more concentrated around the centroid (low spread)